<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - April 27, 2025</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        /* Design tokens (CSS custom properties) for the light palette with teal/aqua accents.
           Hex values correspond to the Tailwind colors named beside each entry. */
        :root {
            /* Surfaces and text */
            --bg-color: #f8fafc; /* Tailwind slate-50; read by the body rule */
            --card-bg-color: #ffffff; /* White. NOTE(review): no rule visible in this section reads it (.bento-item hard-codes rgba(255, 255, 255, 0.7)); confirm before removing */
            --text-color: #1e293b; /* Tailwind slate-800; default text color */
            --text-muted-color: #64748b; /* Tailwind slate-500; secondary text */
            --header-color: #0f172a; /* Tailwind slate-900; page heading */
            --highlight-primary: #14b8a6; /* Tailwind teal-500; titles, links, divider */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300. NOTE(review): only referenced from a commented-out gradient in the visible rules */
            --border-color: #e2e8f0; /* Tailwind slate-200; footer top border */
            --shadow-color: rgba(15, 23, 42, 0.08); /* slate-900 at 8% alpha; card shadows */
        }

        /* Page-wide defaults: typeface, line rhythm, and palette. */
        body {
            font-family: 'Inter', sans-serif;
            line-height: 1.6;
            color: var(--text-color);
            background-color: var(--bg-color);
            overflow-x: hidden; /* suppress horizontal scrolling */
        }

        /* Vertical stack of paper cards. */
        .bento-grid {
            display: grid;
            grid-template-columns: 1fr; /* a single column at every width */
            gap: 1.5rem; /* same value as Tailwind gap-6 */
            padding-bottom: 4rem; /* space below the last card */
        }

        /* Card surface: translucent white panel that blurs whatever is behind it. */
        .bento-item {
            position: relative;
            overflow: hidden; /* clip children to the rounded corners */
            padding: 1.75rem;
            border: 1px solid rgba(226, 232, 240, 0.5); /* slate-200 at 50% alpha */
            border-radius: 1rem;
            background-color: rgba(255, 255, 255, 0.7); /* white at 70% alpha */
            -webkit-backdrop-filter: blur(10px); /* prefixed form for Safari */
            backdrop-filter: blur(10px);
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: box-shadow 0.3s ease-out, transform 0.3s ease-out, background-color 0.3s ease-out;
        }

        /* Hover state: lift the card and deepen its shadow. */
        .bento-item:hover {
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06);
            transform: translateY(-6px);
        }

        /* Card heading, tinted with the primary accent color. */
        .paper-title {
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            color: var(--highlight-primary);
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            line-height: 1.4;
        }

        /* Abstract text in the muted text color. */
        .paper-summary {
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            line-height: 1.6;
        }

        /* Pill-shaped link shown on each card. */
        .paper-link {
            display: inline-flex; /* icon and label share one alignment axis */
            align-items: center;
            padding: 0.5rem 1rem;
            border: 1px solid rgba(20, 184, 166, 0.2);
            border-radius: 0.5rem;
            background-color: rgba(20, 184, 166, 0.08); /* faint teal wash */
            color: var(--highlight-primary);
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            text-decoration: none;
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        /* Hover state: stronger wash, darker text, 1px lift. */
        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Tailwind teal-600 */
            transform: translateY(-1px);
        }

        /* Icon inside the link. */
        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        /* Nudge the icon to the right while the link is hovered. */
        .paper-link:hover i {
            transform: translateX(2px);
        }

        /* Author byline: small, italic, muted. */
        .paper-authors {
            margin-top: 1rem; /* Tailwind mt-4 */
            color: var(--text-muted-color);
            font-size: 0.75rem; /* Tailwind text-xs */
            font-style: italic;
        }

        /* Page header block: centered text with space above and below.
           NOTE(review): in the visible markup this class is applied to a <motion.div>,
           which browsers parse as an unknown element (inline by default); vertical margin
           and padding do not affect layout on inline boxes, so the two spacing values
           below may have no visible effect unless a script changes the element. Confirm. */
        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        /* Main page title. */
        .header h1 {
            font-size: 2.5rem; /* Between Tailwind text-4xl (2.25rem) and text-5xl (3rem) */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional, currently disabled: gradient-filled title text */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        /* Sub-heading lines under the title: capped at 600px wide and centered by auto side margins. */
        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        /* Page footer: muted, centered text above a hairline top border. */
        .footer {
            margin-top: 4rem;
            padding-top: 2rem;
            padding-bottom: 2rem; /* together with padding-top: Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            text-align: center;
        }

        /* Decorative 1px rule whose teal color fades to transparent at both ends. */
        .line-graphic {
            height: 1px;
            margin: 1.5rem 0;
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
        }

        /* Selector hook for elements carrying the data-motion-element attribute.
           The rule is currently empty and therefore applies no styles. */
        [data-motion-element] {
             /* no declarations */
        }

        /* One-line summary paragraph: bold and slightly larger/darker than .paper-summary. */
        .paper-tldr {
            font-size: 0.95rem; /* .paper-summary uses 0.875rem */
            color: #475569; /* Tailwind slate-600; .paper-summary uses slate-500 */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            /* font-style: italic; (currently disabled) */
            font-weight: bold;
        }

        /* Overall rating row: amber tint with 1rem of space above and below. */
        .paper-rating {
            color: #f59e0b; /* Tailwind amber-500 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            margin-top: 1rem; /* Tailwind mt-4 */
        }

        /* Small gap after each icon in the overall row. */
        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Per-criterion icons use the same amber and spacing as the overall row. */
        .paper-sub-ratings .rating-item i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <!-- Page header: title, tagline, issue date, decorative divider.
         NOTE(review): `motion.div` is not a standard HTML element; browsers parse it as an
         unknown element, so the initial/animate/transition attributes are inert unless a
         script elsewhere in the page reads them. Confirm before relying on the animation. -->
    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <!-- The datetime attribute makes the issue date machine-readable. -->
        <p><time datetime="2025-04-27">April 27, 2025</time></p>
        <!-- Decorative divider line. -->
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div>
    </motion.div>

    <div class="bento-grid" id="paper-grid">
        
        <!-- Paper card: arXiv 2504.18810.
             The viewport amount of 0.2 means "trigger when 20% of the card is visible"; that
             note lives here because CSS-style comments inside a start tag are parsed as
             attributes.
             NOTE(review): `motion.div` is not a standard HTML element, so the animation
             attributes are inert unless a script elsewhere reads them. Confirm. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Audio-Driven Talking Face Video Generation with Joint Uncertainty Learning</h2>
            <p class="paper-summary">Talking face video generation with arbitrary speech audio is a significant
challenge within the realm of digital human technology. The previous studies
have emphasized the significance of audio-lip synchronization and visual
quality. Currently, limited attention has been given to the learning of visual
uncertainty, which creates several issues in existing systems, including
inconsistent visual quality and unreliable performance across different input
conditions. To address the problem, we propose a Joint Uncertainty Learning
Network (JULNet) for high-quality talking face video generation, which
incorporates a representation of uncertainty that is directly related to visual
error. Specifically, we first design an uncertainty module to individually
predict the error map and uncertainty map after obtaining the generated image.
The error map represents the difference between the generated image and the
ground truth image, while the uncertainty map is used to predict the
probability of incorrect estimates. Furthermore, to match the uncertainty
distribution with the error distribution through a KL divergence term, we
introduce a histogram technique to approximate the distributions. By jointly
optimizing error and uncertainty, the performance and robustness of our model
can be enhanced. Extensive experiments demonstrate that our method achieves
superior high-fidelity and audio-lip synchronization in talking face video
generation compared to previous methods.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a joint uncertainty learning network (julnet) for audio-driven talking face video generation, focusing on mitigating visual uncertainty to improve visual quality and audio-lip synchronization.</p>

            <!-- Simplified Chinese summary; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一种用于音频驱动的说话人脸视频生成的联合不确定性学习网络（julnet），专注于减轻视觉不确定性，以提高视觉质量和音频唇部同步。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <!-- Overall rating. Icons stay on separate lines so the inter-icon whitespace is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- External link: https, plus rel="noopener noreferrer" because it opens a new tab.
                 aria-label keeps the visible text and adds the paper title for out-of-context use. -->
            <a href="https://arxiv.org/abs/2504.18810v1" target="_blank" rel="noopener noreferrer" class="paper-link" aria-label="Read Paper (PDF): Audio-Driven Talking Face Video Generation with Joint Uncertainty Learning (opens in new tab)">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Yifan Xie, Fei Ma, Yi Bin, Ying He, Fei Yu</p>

        </motion.div>
        
        <!-- Paper card: arXiv 2504.18624.
             The viewport amount of 0.2 means "trigger when 20% of the card is visible"; that
             note lives here because CSS-style comments inside a start tag are parsed as
             attributes.
             NOTE(review): `motion.div` is not a standard HTML element, so the animation
             attributes are inert unless a script elsewhere reads them. Confirm. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Validation and Calibration of Semi-Analytical Models for the Event Horizon Telescope Observations of Sagittarius A*</h2>
            <!-- The feed's raw LaTeX macro for the model name has been replaced with the name itself (ALINet). -->
            <p class="paper-summary">The Event Horizon Telescope (EHT) enables the exploration of black hole
accretion flows at event-horizon scales. Fitting ray-traced physical models to
EHT observations requires the generation of synthetic images, a task that is
computationally demanding. This study leverages ALINet, a generative machine
learning model, to efficiently produce radiatively inefficient accretion flow
(RIAF) images as a function of the specified physical parameters. ALINet has
previously been shown to be able to interpolate black hole images and their
associated physical parameters after training on a computationally tractable
set of library images. We utilize this model to estimate the uncertainty
introduced by a number of anticipated unmodeled physical effects, including
interstellar scattering and intrinsic source variability. We then use this to
calibrate physical parameter estimates and their associated uncertainties from
RIAF model fits to mock EHT data via a library of general relativistic
magnetohydrodynamics models.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper utilizes a generative machine learning model (alinet) to efficiently generate black hole accretion flow images for event horizon telescope data analysis, addressing the computational cost of fitting physical models to observations and calibrating parameter estimates.</p>

            <!-- Simplified Chinese summary; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文利用生成式机器学习模型（alinet）高效生成黑洞吸积流图像，用于事件视界望远镜的数据分析，解决了将物理模型拟合到观测数据时的计算成本问题，并校准了参数估计。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall rating. Icons stay on separate lines so the inter-icon whitespace is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <!-- External link: https, plus rel="noopener noreferrer" because it opens a new tab.
                 aria-label keeps the visible text and adds the paper title for out-of-context use. -->
            <a href="https://arxiv.org/abs/2504.18624v1" target="_blank" rel="noopener noreferrer" class="paper-link" aria-label="Read Paper (PDF): Validation and Calibration of Semi-Analytical Models for the Event Horizon Telescope Observations of Sagittarius A* (opens in new tab)">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Ali SaraerToosi, Avery Broderick</p>

        </motion.div>
        
        <!-- Paper card: arXiv 2504.18800.
             The viewport amount of 0.2 means "trigger when 20% of the card is visible"; that
             note lives here because CSS-style comments inside a start tag are parsed as
             attributes.
             NOTE(review): `motion.div` is not a standard HTML element, so the animation
             attributes are inert unless a script elsewhere reads them. Confirm. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Video CLIP Model for Multi-View Echocardiography Interpretation</h2>
            <p class="paper-summary">Echocardiography involves recording videos of the heart using ultrasound,
enabling clinicians to evaluate its condition. Recent advances in large-scale
vision-language models (VLMs) have garnered attention for automating the
interpretation of echocardiographic videos. However, most existing VLMs
proposed for medical interpretation thus far rely on single-frame (i.e., image)
inputs. Consequently, these image-based models often exhibit lower diagnostic
accuracy for conditions identifiable through cardiac motion. Moreover,
echocardiographic videos are recorded from various views that depend on the
direction of ultrasound emission, and certain views are more suitable than
others for interpreting specific conditions. Incorporating multiple views could
potentially yield further improvements in accuracy. In this study, we developed
a video-language model that takes five different views and full video sequences
as input, training it on pairs of echocardiographic videos and clinical reports
from 60,747 cases. Our experiments demonstrate that this expanded approach
achieves higher interpretation accuracy than models trained with only
single-view videos or with still images.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a video-language model using multiple echocardiography views and full video sequences to improve diagnostic accuracy in cardiac condition interpretation, outperforming single-view video or still image-based models.</p>

            <!-- Simplified Chinese summary; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一种视频语言模型，该模型使用多个超声心动图视图和完整的视频序列来提高心脏状况解释的诊断准确性，优于基于单视图视频或静态图像的模型。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(5/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall rating. Icons stay on separate lines so the inter-icon whitespace is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>

            <!-- External link: https, plus rel="noopener noreferrer" because it opens a new tab.
                 aria-label keeps the visible text and adds the paper title for out-of-context use. -->
            <a href="https://arxiv.org/abs/2504.18800v1" target="_blank" rel="noopener noreferrer" class="paper-link" aria-label="Read Paper (PDF): Video CLIP Model for Multi-View Echocardiography Interpretation (opens in new tab)">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Ryo Takizawa, Satoshi Kodera, Tempei Kabayama, Ryo Matsuoka, Yuta Ando, Yuto Nakamura, Haruki Settai, Norihiko Takeda</p>

        </motion.div>
        
        <!-- Paper card: arXiv 2504.18746.
             The viewport amount of 0.2 means "trigger when 20% of the card is visible"; that
             note lives here because CSS-style comments inside a start tag are parsed as
             attributes. The transition delay is written as 0.15 rather than the
             floating-point artefact 0.15000000000000002.
             NOTE(review): `motion.div` is not a standard HTML element, so the animation
             attributes are inert unless a script elsewhere reads them. Confirm. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.15, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Dream-Box: Object-wise Outlier Generation for Out-of-Distribution Detection</h2>
            <p class="paper-summary">Deep neural networks have demonstrated great generalization capabilities for
tasks whose training and test sets are drawn from the same distribution.
Nevertheless, out-of-distribution (OOD) detection remains a challenging task
that has received significant attention in recent years. Specifically, OOD
detection refers to the detection of instances that do not belong to the
training distribution, while still having good performance on the
in-distribution task (e.g., classification or object detection). Recent work
has focused on generating synthetic outliers and using them to train an outlier
detector, generally achieving improved OOD detection than traditional OOD
methods. In this regard, outliers can be generated either in feature or pixel
space. Feature space driven methods have shown strong performance on both the
classification and object detection tasks, at the expense that the
visualization of training outliers remains unknown, making further analysis on
OOD failure modes challenging. On the other hand, pixel space outlier
generation techniques enabled by diffusion models have been used for image
classification using, providing improved OOD detection performance and outlier
visualization, although their adaption to the object detection task is as yet
unexplored. We therefore introduce Dream-Box, a method that provides a link to
object-wise outlier generation in the pixel space for OOD detection.
Specifically, we use diffusion models to generate object-wise outliers that are
used to train an object detector for an in-distribution task and OOD detection.
Our method achieves comparable performance to previous traditional methods
while being the first technique to provide concrete visualization of generated
OOD objects.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces dream-box, a method for object-wise outlier generation using diffusion models for out-of-distribution (ood) object detection, providing visualization of generated ood objects.</p>

            <!-- Simplified Chinese summary; lang lets assistive tech and fonts switch language. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了 dream-box，一种使用扩散模型进行目标级别的异常值生成的方法，用于异常检测的目标检测，并提供了生成异常目标的视觉呈现。</p>

            <!-- Per-criterion ratings. Star icons are decorative; the "(n/10)" text carries the value. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <!-- Overall rating. Icons stay on separate lines so the inter-icon whitespace is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>

            <!-- External link: https, plus rel="noopener noreferrer" because it opens a new tab.
                 aria-label keeps the visible text and adds the paper title for out-of-context use. -->
            <a href="https://arxiv.org/abs/2504.18746v1" target="_blank" rel="noopener noreferrer" class="paper-link" aria-label="Read Paper (PDF): Dream-Box: Object-wise Outlier Generation for Out-of-Distribution Detection (opens in new tab)">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Brian K. S. Isaac-Medina, Toby P. Breckon</p>

        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">PiercingEye: Dual-Space Video Violence Detection with Hyperbolic Vision-Language Guidance</h2>
            <p class="paper-summary">Existing weakly supervised video violence detection (VVD) methods primarily
rely on Euclidean representation learning, which often struggles to distinguish
visually similar yet semantically distinct events due to limited hierarchical
modeling and insufficient ambiguous training samples. To address this
challenge, we propose PiercingEye, a novel dual-space learning framework that
synergizes Euclidean and hyperbolic geometries to enhance discriminative
feature representation. Specifically, PiercingEye introduces a layer-sensitive
hyperbolic aggregation strategy with hyperbolic Dirichlet energy constraints to
progressively model event hierarchies, and a cross-space attention mechanism to
facilitate complementary feature interactions between Euclidean and hyperbolic
spaces. Furthermore, to mitigate the scarcity of ambiguous samples, we leverage
large language models to generate logic-guided ambiguous event descriptions,
enabling explicit supervision through a hyperbolic vision-language contrastive
loss that prioritizes high-confusion samples via dynamic similarity-aware
weighting. Extensive experiments on XD-Violence and UCF-Crime benchmarks
demonstrate that PiercingEye achieves state-of-the-art performance, with
particularly strong results on a newly curated ambiguous event subset,
validating its superior capability in fine-grained violence detection.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces piercingeye, a dual-space (euclidean and hyperbolic) learning framework for video violence detection that utilizes large language models to generate ambiguous event descriptions for better training, achieving state-of-the-art performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为PiercingEye的双空间（欧几里得和双曲）学习框架，用于视频暴力检测。该框架利用大型语言模型生成模糊事件描述以进行更好的训练，从而实现最先进的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.18866v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jiaxu Leng, Zhanjie Wu, Mingpi Tan, Mengjingcheng Mo, Jiankang Zheng, Qingqing Li, Ji Gan, Xinbo Gao</p>
            
        </motion.div>
        
        <!-- Paper card: "Co-Training with Active Contrastive Learning and Meta-Pseudo-Labeling
             on 2D Projections for Deep Semi-Supervised Learning" (arXiv 2504.18666v1).
             The initial / whileInView / viewport / transition attributes carry
             Framer-Motion-style settings as strings; viewport amount 0.2 means
             "trigger when 20% of the card is visible". Notes cannot live inside a start tag
             (the parser would turn them into attributes), so they are kept up here.
             NOTE(review): presumably a script elsewhere in this file reads these attributes
             from [data-motion-element] nodes; confirm before renaming any of them.
             Star icons are decorative (aria-hidden); the "(n/10)" text carries the score. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Co-Training with Active Contrastive Learning and Meta-Pseudo-Labeling on 2D Projections for Deep Semi-Supervised Learning</h2>
            <p class="paper-summary">A major challenge that prevents the training of DL models is the limited
availability of accurately labeled data. This shortcoming is highlighted in
areas where data annotation becomes a time-consuming and error-prone task. In
this regard, SSL tackles this challenge by capitalizing on scarce labeled and
abundant unlabeled data; however, SoTA methods typically depend on pre-trained
features and large validation sets to learn effective representations for
classification tasks. In addition, the reduced set of labeled data is often
randomly sampled, neglecting the selection of more informative samples. Here,
we present active-DeepFA, a method that effectively combines CL,
teacher-student-based meta-pseudo-labeling and AL to train non-pretrained CNN
architectures for image classification in scenarios of scarcity of labeled and
abundance of unlabeled data. It integrates DeepFA into a co-training setup that
implements two cooperative networks to mitigate confirmation bias from
pseudo-labels. The method starts with a reduced set of labeled samples by
warming up the networks with supervised CL. Afterward and at regular epoch
intervals, label propagation is performed on the 2D projections of the
networks' deep features. Next, the most reliable pseudo-labels are exchanged
between networks in a cross-training fashion, while the most meaningful samples
are annotated and added into the labeled set. The networks independently
minimize an objective loss function comprising supervised contrastive,
supervised and semi-supervised loss components, enhancing the representations
towards image classification. Our approach is evaluated on three challenging
biological image datasets using only 5% of labeled samples, improving baselines
and outperforming six other SoTA methods. In addition, it reduces annotation
effort by achieving comparable results to those of its counterparts with only
3% of labeled data.</p>

            <!-- English TLDR; proper nouns follow the spelling used in the abstract above. -->
            <p class="paper-tldr"><strong>TLDR</strong>: This paper proposes an active co-training method (active-DeepFA) combining contrastive learning, meta-pseudo-labeling, and active learning on 2D feature projections to train CNNs for image classification with limited labeled data, achieving SoTA results on biological image datasets.</p>

            <!-- Chinese TLDR; lang lets browsers and screen readers pick the right voice and glyphs. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文提出了一种主动协同训练方法 (active-DeepFA)，结合了对比学习、元伪标签和在 2D 特征投影上的主动学习，用于在标记数据有限的情况下训练 CNN 进行图像分类，并在生物图像数据集上取得了 SoTA 效果。</p>

            <!-- Per-criterion scores: five icons each, one icon per two points. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <!-- Overall score. Icons stay one per line so the whitespace between them is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>

            <!-- Opens in a new tab; rel stops the new page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.18666v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: David Aparco-Cardenas, Jancarlo F. Gomes, Alexandre X. Falcão, Pedro J. de Rezende</p>

        </motion.div>
        
        <!-- Paper card: "Dexonomy: Synthesizing All Dexterous Grasp Types in a Grasp Taxonomy"
             (arXiv 2504.18829v1).
             The initial / whileInView / viewport / transition attributes carry
             Framer-Motion-style settings as strings; viewport amount 0.2 means
             "trigger when 20% of the card is visible". Notes cannot live inside a start tag
             (the parser would turn them into attributes), so they are kept up here.
             NOTE(review): presumably a script elsewhere in this file reads these attributes
             from [data-motion-element] nodes; confirm before renaming any of them.
             Star icons are decorative (aria-hidden); the "(n/10)" text carries the score. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.3, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Dexonomy: Synthesizing All Dexterous Grasp Types in a Grasp Taxonomy</h2>
            <p class="paper-summary">Generalizable dexterous grasping with suitable grasp types is a fundamental
skill for intelligent robots. Developing such skills requires a large-scale and
high-quality dataset that covers numerous grasp types (i.e., at least those
categorized by the GRASP taxonomy), but collecting such data is extremely
challenging. Existing automatic grasp synthesis methods are often limited to
specific grasp types or object categories, hindering scalability. This work
proposes an efficient pipeline capable of synthesizing contact-rich,
penetration-free, and physically plausible grasps for any grasp type, object,
and articulated hand. Starting from a single human-annotated template for each
hand and grasp type, our pipeline tackles the complicated synthesis problem
with two stages: optimize the object to fit the hand template first, and then
locally refine the hand to fit the object in simulation. To validate the
synthesized grasps, we introduce a contact-aware control strategy that allows
the hand to apply the appropriate force at each contact point to the object.
Those validated grasps can also be used as new grasp templates to facilitate
future synthesis. Experiments show that our method significantly outperforms
previous type-unaware grasp synthesis baselines in simulation. Using our
algorithm, we construct a dataset containing 10.7k objects and 9.5M grasps,
covering 31 grasp types in the GRASP taxonomy. Finally, we train a
type-conditional generative model that successfully performs the desired grasp
type from single-view object point clouds, achieving an 82.3% success rate in
real-world experiments. Project page: https://pku-epic.github.io/Dexonomy.</p>

            <!-- English TLDR; proper nouns follow the spelling used in the abstract above. -->
            <p class="paper-tldr"><strong>TLDR</strong>: This paper introduces a method for synthesizing a large-scale dataset of dexterous grasps covering the GRASP taxonomy, and demonstrates its use by training a type-conditional generative model for grasping from single-view object point clouds, achieving good real-world performance.</p>

            <!-- Chinese TLDR; lang lets browsers and screen readers pick the right voice and glyphs. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一种合成大规模灵巧抓取数据集的方法，该数据集涵盖了GRASP分类，并通过训练一个类型条件生成模型，从单视图物体点云中进行抓取，证明了其在现实世界中的良好性能。</p>

            <!-- Per-criterion scores: five icons each, one icon per two points. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <!-- Overall score. Icons stay one per line so the whitespace between them is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- Opens in a new tab; rel stops the new page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.18829v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Jiayi Chen, Yubin Ke, Lin Peng, He Wang</p>

        </motion.div>
        
        <!-- Paper card: "Depth as Points: Center Point-based Depth Estimation"
             (arXiv 2504.18773v1).
             The initial / whileInView / viewport / transition attributes carry
             Framer-Motion-style settings as strings; viewport amount 0.2 means
             "trigger when 20% of the card is visible". Notes cannot live inside a start tag
             (the parser would turn them into attributes), so they are kept up here.
             NOTE(review): presumably a script elsewhere in this file reads these attributes
             from [data-motion-element] nodes; confirm before renaming any of them.
             Star icons are decorative (aria-hidden); the "(n/10)" text carries the score. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.35, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Depth as Points: Center Point-based Depth Estimation</h2>
            <p class="paper-summary">The perception of vehicles and pedestrians in urban scenarios is crucial for
autonomous driving. This process typically involves complicated data
collection, imposes high computational and hardware demands. To address these
limitations, we first develop a highly efficient method for generating virtual
datasets, which enables the creation of task- and scenario-specific datasets in
a short time. Leveraging this method, we construct the virtual depth estimation
dataset VirDepth, a large-scale, multi-task autonomous driving dataset.
Subsequently, we propose CenterDepth, a lightweight architecture for monocular
depth estimation that ensures high operational efficiency and exhibits superior
performance in depth estimation tasks with highly imbalanced height-scale
distributions. CenterDepth integrates global semantic information through the
innovative Center FC-CRFs algorithm, aggregates multi-scale features based on
object key points, and enables detection-based depth estimation of targets.
Experiments demonstrate that our proposed method achieves superior performance
in terms of both computational speed and prediction accuracy.</p>

            <!-- English TLDR; proper nouns follow the spelling used in the abstract above. -->
            <p class="paper-tldr"><strong>TLDR</strong>: This paper introduces a method for generating virtual depth estimation datasets and a lightweight architecture called CenterDepth for monocular depth estimation, achieving high efficiency and accuracy, especially in imbalanced height-scale distributions.</p>

            <!-- Chinese TLDR; lang lets browsers and screen readers pick the right voice and glyphs. -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 本文介绍了一种生成虚拟深度估计数据集的方法，以及一种名为CenterDepth的轻量级单目深度估计架构，该架构具有高效和准确性，尤其是在高度比例分布不平衡的情况下。</p>

            <!-- Per-criterion scores: five icons each, one icon per two points. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <!-- Overall score. Icons stay one per line so the whitespace between them is preserved. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- Opens in a new tab; rel stops the new page from reaching window.opener. -->
            <a href="https://arxiv.org/abs/2504.18773v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Zhiheng Tu, Xinjian Huang, Yong He, Ruiyang Zhou, Bo Du, Weitao Wu</p>

        </motion.div>
        
    </div>

    <!-- Page footer: generation timestamp (machine-readable via <time>) and attribution link.
         The link opens in a new tab, so rel stops the new page from reaching window.opener. -->
    <footer class="footer">
        Generated on <time datetime="2025-04-29T04:29:01Z">2025-04-29 04:29:01 UTC</time>. Powered by <a href="https://github.com/onion-liu" target="_blank" rel="noopener noreferrer">onion-liu</a>.
    </footer>

</body>
</html>